Goal: Graph the top nodes and the nodes they transact with
In [1]:
from IPython.display import display, clear_output
import time
import sys
import logging
import collections
import networkx as nx
import matplotlib
import config
import utils
import graph_analyzer
# Configure the root logger so INFO-level (and higher) messages are emitted.
logging.basicConfig(level=logging.INFO)
# Sentinel stored as the output pubkey when a row's output column cannot be
# parsed as an integer; such outputs are skipped when amounts are summed.
EMPTY_OUTPUT = -1
def log(msg):
    """Print ``msg`` to standard output and flush the stream immediately.

    Args:
        msg: Value to print; it is rendered the way ``print`` renders it.
    """
    # The call form prints a single argument identically on Python 2 and is
    # the only valid spelling on Python 3. It also matches the
    # ``print(counter)`` calls used elsewhere in this file.
    print(msg)
    # Flush so the message is not left sitting in the output buffer.
    sys.stdout.flush()
In [72]:
# Coin whose data is analyzed; substituted into every data file name below
# and passed to the graph analyzer.
COIN_CODE = "PPC"
# How many clusters to request from analyzer.richest_n_clusters.
N_CLUSTERS = 10
# Edge rows: column 0 is read as the transaction id, columns 1-2 as the input
# pubkey and value, columns 3-4 as the output pubkey and value.
CSV_FILE = config.data_dir + "csv_exports/edges-%s.csv" % COIN_CODE
# Count tables: column 0 is read as an integer key and column 1 as its count.
# NOTE(review): the key is presumably the transaction id, since the tables
# are indexed by transaction id when edge weights are computed - confirm.
INPUT_COUNTS_FILE = config.data_dir + "txin/txin-%s.csv" % COIN_CODE
OUTPUT_COUNTS_FILE = config.data_dir + "txout/txout-%s.csv" % COIN_CODE
In [73]:
log("Generating graph analyzer")
# Project-local helper for this coin; the rest of the notebook queries it for
# the richest clusters, the cluster of a pubkey, and a cluster's balance.
analyzer = graph_analyzer.GraphAnalyzer(COIN_CODE)
In [74]:
log("Calculating input and output counts")
# Both tables map an integer key (column 0) to an integer count (column 1);
# keys that never appear in a file read back as 0.
input_counts = collections.defaultdict(int)
output_counts = collections.defaultdict(int)
# Fill the input table first, then the output table, skipping any row that
# does not have at least two columns.
for counts_table, counts_path in (
        (input_counts, INPUT_COUNTS_FILE),
        (output_counts, OUTPUT_COUNTS_FILE)):
    for record in utils.get_csvreader(counts_path):
        if len(record) <= 1:
            continue
        counts_table[int(record[0])] = int(record[1])
In [75]:
log("Determining top clusters")
# (cluster_id, value) pairs returned by the analyzer for the top clusters
top_clusters_data = analyzer.richest_n_clusters(N_CLUSTERS)
# Just the cluster ids, in the order the analyzer returned them.
top_clusters = [cluster_id for cluster_id, _value in top_clusters_data]
In [76]:
log("Creating clusters data structure")
# Maps cluster_id -> cluster value at latest time. Seeded directly from the
# (cluster_id, value) pairs of the top clusters; unknown ids read back as 0.
clusters = collections.defaultdict(int, top_clusters_data)
In [77]:
log("Adding neighboring clusters and calculating edge weights")
# Maps (send_cluster_id, recv_cluster_id) -> total value transferred from the
# sending cluster to the receiving cluster. Pairs that have not been seen yet
# start at 0, so amounts can be accumulated with +=.
edge_weights = collections.defaultdict(int)
def update_edge_weights(tx_inputs, tx_outputs, edge_weights):
    """Fold one transaction's summed inputs and outputs into ``edge_weights``.

    Besides its arguments, this function reads the module-level names
    ``input_counts``, ``output_counts``, ``last_tx``, ``analyzer`` and
    ``top_clusters``, and may add entries to the module-level ``clusters``.
    It must be called while ``last_tx`` still holds the id of the
    transaction that ``tx_inputs`` / ``tx_outputs`` were collected for.

    Args:
        tx_inputs: dict mapping input pubkey -> summed input value. It is
            normalized in place.
        tx_outputs: dict mapping output pubkey -> summed output value. It is
            normalized in place.
        edge_weights: dict keyed by ``(sender_cluster_id, recv_cluster_id)``
            whose values are incremented by the transferred amounts.

    Returns:
        None. Nothing is recorded when ``tx_inputs`` is empty or when no
        sender cluster can be determined.
    """
    # Nothing to attribute without at least one input; this also protects the
    # sender lookup below from an empty mapping.
    if not tx_inputs:
        return

    input_set = set(tx_inputs)
    output_set = set(tx_outputs)

    # correct for multiple counts
    # NOTE(review): presumably every input is repeated once per output in the
    # edge rows (and vice versa), which is why inputs are divided by the
    # output count and outputs by the input count - confirm against the
    # CSV export.
    # ``.get`` avoids inserting missing keys into the count tables, and a
    # missing or zero count falls back to 1 (no correction) instead of
    # raising ZeroDivisionError.
    n_outputs = output_counts.get(last_tx, 0) or 1
    n_inputs = input_counts.get(last_tx, 0) or 1
    # ``//`` keeps the integer division the Python 2 ``/`` performed on these
    # integer amounts, independent of the interpreter version.
    for pubkey in tx_inputs:
        tx_inputs[pubkey] //= n_outputs
    for pubkey in tx_outputs:
        tx_outputs[pubkey] //= n_inputs

    # attempt to determine a sender cluster_id from the first input pubkey in
    # iteration order (the element ``keys()[0]`` returned on Python 2)
    sender_cluster_id = analyzer.cluster_for_pubkey(next(iter(tx_inputs)))
    # NOTE(review): a falsy id (e.g. 0) is treated as "no cluster" here and
    # for receivers below - confirm that real cluster ids are never 0.
    if not sender_cluster_id:
        return

    sender_is_top = sender_cluster_id in top_clusters
    # Pubkeys that also appear among the inputs are not counted as receivers.
    for pubkey in output_set - input_set:
        recv_cluster_id = analyzer.cluster_for_pubkey(pubkey)
        if not recv_cluster_id or recv_cluster_id == sender_cluster_id:
            continue
        # Only keep edges that touch at least one of the top clusters.
        if not (sender_is_top or recv_cluster_id in top_clusters):
            continue
        # Register both endpoints, looking up the balance of any cluster that
        # is not known yet.
        if sender_cluster_id not in clusters:
            clusters[sender_cluster_id] = analyzer.balance_for_cluster(sender_cluster_id)
        if recv_cluster_id not in clusters:
            clusters[recv_cluster_id] = analyzer.balance_for_cluster(recv_cluster_id)
        edge_weights[(sender_cluster_id, recv_cluster_id)] += tx_outputs[pubkey]
# Stream the edge CSV and aggregate rows per transaction. A change of tx_id is
# treated as the end of a transaction, so all rows of one transaction must be
# contiguous in the file.
last_tx = None
tx_inputs = collections.defaultdict(int)    # input pubkey -> summed value
tx_outputs = collections.defaultdict(int)   # output pubkey -> summed value
counter = 0
for row in utils.get_csvreader(CSV_FILE):
    if len(row) < 2:
        continue
    # Progress indicator, refreshed in place every 10000 processed rows.
    if counter % 10000 == 0:
        clear_output(wait=True)
        print(counter)
        sys.stdout.flush()
    tx_id = int(row[0])
    # detect coingen: an input that cannot be parsed as integers is recorded
    # against the coingen address with value 0. Only parsing failures are
    # caught, so interrupts and unrelated errors are no longer swallowed.
    try:
        pubkey_input = int(row[1])
        value_input = int(row[2])
    except (ValueError, IndexError, TypeError):
        pubkey_input = config.COINGEN_ADDRESS
        value_input = 0
    # An output pubkey that cannot be parsed is marked with the sentinel and
    # skipped when the output amounts are summed below.
    try:
        pubkey_output = int(row[3])
    except (ValueError, IndexError, TypeError):
        pubkey_output = EMPTY_OUTPUT
    value_output = int(row[4])
    if tx_id != last_tx and tx_inputs:
        # end of reading a transaction series. Update edges
        # (last_tx still names the finished transaction at this point)
        update_edge_weights(tx_inputs, tx_outputs, edge_weights)
        tx_inputs = collections.defaultdict(int)
        tx_outputs = collections.defaultdict(int)
    last_tx = tx_id
    tx_inputs[pubkey_input] += value_input
    if pubkey_output != EMPTY_OUTPUT:
        tx_outputs[pubkey_output] += value_output
    counter += 1
# Flush the final transaction, if any rows were read at all; an empty input
# file would otherwise make the edge update fail on an empty mapping.
if tx_inputs:
    update_edge_weights(tx_inputs, tx_outputs, edge_weights)
In [78]:
# ``import matplotlib`` alone does not load the pyplot submodule, so import it
# explicitly instead of relying on something else having imported it.
import matplotlib.pyplot as plt


def _scaled(value, max_value, factor):
    """Return ``value`` as an integer fraction of ``max_value`` times ``factor``.

    Returns 0 when ``max_value`` is 0 so an all-zero data set does not divide
    by zero.
    """
    if not max_value:
        return 0
    return int((value / float(max_value)) * factor)


G = nx.DiGraph()
log("Adding nodes")
G.add_nodes_from(clusters.keys())

log("Adding edges")
counter = 0
for key in edge_weights:
    # Progress indicator, refreshed in place every 1000 edges.
    if counter % 1000 == 0:
        clear_output(wait=True)
        print(counter)
        sys.stdout.flush()
    try:
        G.add_weighted_edges_from([(key[0], key[1], edge_weights[key])])
    except Exception:
        # Stay best-effort (one bad edge must not abort the plot), but report
        # the failure instead of hiding it.
        logging.exception("Could not add edge %r", key)
    counter += 1

# only print connected nodes
connected_nodes = set()
for sender, recvr in edge_weights:
    connected_nodes.add(sender)
    connected_nodes.add(recvr)
nodelist = list(connected_nodes)

log("Calculating node sizes")
# Sizes are computed for exactly the nodes that are drawn, in the same order
# as ``nodelist``, so every size lines up with its node.
max_value = max(clusters.values()) if clusters else 0
node_sizes = [_scaled(clusters.get(node, 0), max_value, 10000) for node in nodelist]

fig = plt.gcf()
fig.set_size_inches(20, 20)

log("Calculating layout")
# Older networkx releases export pygraphviz_layout at the top level; newer
# ones only provide it in the nx_agraph module.
layout_fn = getattr(nx, "pygraphviz_layout", None)
if layout_fn is None:
    layout_fn = nx.nx_agraph.pygraphviz_layout
pos = layout_fn(G)

log("Drawing")
# Nodes only (edgelist=[]); the edges are drawn below, one call per edge, so
# each can get its own width.
nx.draw_networkx(G, pos=pos, with_labels=False, nodelist=nodelist, node_size=node_sizes, alpha=0.8, edgelist=[])

log("Drawing edges")
max_edge_weight = max(edge_weights.values()) if edge_weights else 0
counter = 0
for key in edge_weights:
    # Progress indicator, refreshed in place every 100 edges.
    if counter % 100 == 0:
        clear_output(wait=True)
        print(counter)
        sys.stdout.flush()
    # Width is the edge's share of the heaviest edge, scaled to 0..100 and
    # truncated to an integer.
    nx.draw_networkx_edges(G, pos, edgelist=[key], width=_scaled(edge_weights[key], max_edge_weight, 100), alpha=0.4, arrows=False)
    counter += 1
fig.savefig('graph.png', dpi=400)
In [ ]: